import requests
import csv
from bs4 import BeautifulSoup
import re

#定义寻找特征字符位置的函数
def find_en(st, text):
    """Return the start offsets of every occurrence of ``st`` in ``text``.

    Occurrences are searched left to right, restarting one character after
    the previous hit, so overlapping matches are reported as well.

    Args:
        st: Substring to look for.
        text: String to search in.

    Returns:
        A list of offsets in ascending order, terminated by a ``-1``
        sentinel once no further occurrence exists (``[-1]`` when there is
        no match at all). At most 1000 searches are performed, so if all
        1000 of them succeed the list has 1000 offsets and no sentinel.
    """
    position = []
    start = 0
    for _ in range(1000):
        # Search the string that was passed in, not a module-level global.
        pos = text.find(st, start)
        position.append(pos)
        if pos == -1:
            break
        start = pos + 1
    return position

#定义修正中英文冒号的函数，将中文冒号的QA插入
def correct(datalist1, datalist2):
    """Merge the values of ``datalist1`` into ``datalist2`` in place.

    Every element of ``datalist1`` except the last one is inserted into
    ``datalist2`` directly in front of the first element that is greater
    than it. The last element of ``datalist2`` is never considered as an
    insertion point; a value with no greater element before that point is
    not inserted at all.

    Presumably both lists are offset lists ending in a ``-1`` sentinel, as
    produced by ``find_en`` - verify against callers.

    Args:
        datalist1: Values to merge; its final element is skipped.
        datalist2: List that receives the values; modified in place.

    Returns:
        None.
    """
    for src_index in range(len(datalist1) - 1):
        candidate = datalist1[src_index]
        # The searchable range is fixed before looking for a slot.
        search_end = len(datalist2) - 1
        slot = next(
            (k for k in range(search_end) if datalist2[k] > candidate),
            None,
        )
        if slot is not None:
            datalist2.insert(slot, candidate)

#去除可能出现在A中的小标题
def delete(list1):
    """Cut each string in ``list1`` at its first run of four newlines.

    When an entry contains ``'\\n\\n\\n\\n'``, everything after the first of
    those four newline characters is dropped (that single newline is kept).
    Entries without the marker are left untouched.

    Args:
        list1: List of strings; modified in place.

    Returns:
        None.
    """
    marker = '\n\n\n\n'
    for idx, entry in enumerate(list1):
        cut = entry.find(marker)
        if cut == -1:
            continue
        list1[idx] = entry[:cut + 1]

if __name__ == "__main__":
    # Scrape the MindSpore r1.8 API reference pages listed below, split each
    # page into "page intro" + one entry per second-level heading, attach the
    # keywords from `tag` found in every entry and write all entries to
    # html_api_data_dataset.csv.
    #
    # `url_html` and `process` are parallel lists: process[i] is the module
    # name used for the page at url_html[i].
    #url_ipynb = ['https://www.mindspore.cn/mindinsight/docs/zh-CN/r1.7/training_visual_design.html']
    url_html = [
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.common.initializer.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.communication.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.dataset.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.dataset.audio.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.dataset.config.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.dataset.text.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.dataset.transforms.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.dataset.vision.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.mindrecord.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.nn.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.nn.probability.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.nn.transformer.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.numpy.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.ops.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.ops.functional.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.parallel.nn.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.scipy.html',
        'https://www.mindspore.cn/docs/zh-CN/r1.8/api_python/mindspore.boost.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_NN.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_abstract.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_api.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_api_utils.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_common.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_converter.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_dataset.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_dataset_config.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_dataset_text.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_dataset_transforms.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_dataset_vision.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_datatype.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_format.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_id_generator.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_kernel.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_label_manage.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_ops.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_registry.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_registry_opencl.html',
        'https://www.mindspore.cn/lite/api/zh-CN/r1.8/api_cpp/mindspore_tensor.html',
    ]

    process = [
        'mindspore', 'mindspore.common.initializer', 'mindspore.communication', 'mindspore.dataset',
        'mindspore.dataset.audio', 'mindspore.dataset.config', 'mindspore.dataset.text',
        'mindspore.dataset.transforms', 'mindspore.dataset.vision', 'mindspore.mindrecord', 'mindspore.nn',
        'mindspore.nn.probability', 'mindspore.nn.transformer',
        'mindspore.numpy', 'mindspore.ops', 'mindspore.ops.functional', 'mindspore.parallel.nn',
        'mindspore.scipy', 'mindspore.boost', 'mindspore',
        'mindspore::NN', 'mindspore::abstract', 'mindspore::api', 'mindspore::api::utils',
        'mindspore::common', 'mindspore::converter',
        'mindspore::dataset', 'mindspore::dataset::config', 'mindspore::dataset::text',
        'mindspore::dataset::transforms', 'mindspore::dataset::vision',
        'mindspore::DataType', 'mindspore::Format', 'mindspore::id_generator', 'mindspore::kernel',
        'mindspore::label_manage', 'mindspore::ops',
        'mindspore::registry', 'mindspore::registry::opencl', 'mindspore::tensor',
    ]

    # Keywords searched (case-insensitively) in every extracted entry.
    # NOTE: a few keywords occur more than once in this list; each occurrence
    # that matches contributes its own item to the entry's tag string.
    tag = ['白皮书', 'MindSpore', '全场景', '自动微分', 'GradOperation', 'MindSpore', 'Parameter Server',
           '并行', '优化器', 'Host&Device', '分布式', '算子', '接口', '数据集', '集合通信', '同步模式', '损失函数', '控制流',
           '自由变量', '闭包', 'MindRecord', 'Pipeline', 'Adaptor', 'Optimizer', 'Runtime', 'Operators', 'Callback',
           'MindSpore AKG', 'Auto-Tiling', 'Auto-Mapping', 'Davinci', '样例脚本', 'GPU', 'CPU', 'CuBLAS库', 'Linux', 'Ascend',
           '模型推理', 'MindInsight', 'plugin_name', 'MindArmour', 'AI Fuzzer', 'MINDIR', '缩略语',
           '数据类型', '原型', '运算符', '语句', '内置函数', '网络',
           'PyNative', 'MindConverter', '评估', 'TensorFlow', 'Pytorch', 'ONNX', 'PB', 'API', 'Model', 'SGD', 'loss', 'metric', 'ARM', 'AST',
           'TensorBoard', 'Dataloader', 'GeneratorDataset', ' GradOperation', 'TrainOneStepCell', '反向传播', '求导', 'weight decay', 'LR', '学习率',
           'Aten', '调试', 'MindOptimizer', '精度', 'checklist', '数据', 'loss scale', '超参', '计算图', '精度', '可视化', '模型结构',
           '优化', '迭代', 'ModelZoo', 'MindSpore Serving', '脚本', '单机', 'profiling', 'MindData', '框架', '性能', 'Dataset', 'macOS', 'SDK版本', 'SciPy',
           'whl包', 'protobuf', 'Ubuntu', 'Windows', 'WSL', 'Conda', 'Serving', 'gmp', 'cuda', 'JupyterLab', 'eval', 'NLP', 'HCLL', 'ModelArts', 'Graph',
           'C++', 'AIPP', 'OpenMPI', 'NCLL', 'RDMA', 'IB', 'RoCE', 'taichi', 'Caffe', 'NPU', 'Embedding', 'Parameter Server', 'JIT Fallback']

    if len(url_html) != len(process):
        raise ValueError("url_html and process must have the same number of entries")

    # Escape regex metacharacters in the keywords (e.g. "C++") and compile
    # every pattern once instead of once per entry.
    tag_patterns = [re.compile(re.escape(keyword), flags=re.IGNORECASE) for keyword in tag]

    Dataset = []  # rows for the CSV file: [title, link, content, process, tag]

    # Main loop: one iteration per documentation page.
    for page_url, page_name in zip(url_html, process):
        # Download and parse the page.
        req = requests.get(url=page_url, timeout=30)
        req.raise_for_status()
        req.encoding = 'utf-8'
        bs = BeautifulSoup(req.text, "html.parser")
        all_sections = bs.find_all("div", attrs={"class": "section"})

        # The first-level title is derived from the module name, not from the
        # <h1> text: "::" and "_" become "-" and the result is lower-cased.
        page_title = page_name.replace('::', '-').replace('_', '-').lower()

        pre_title_1 = []  # one copy of page_title per section that has an <h1>
        pre_title_2 = []  # text of the <h2> of every section without an <h1>
        for section in all_sections:
            h_1 = section.find('h1')
            h_2 = section.find('h2')
            if h_1 is not None:
                pre_title_1.append(page_title)
            elif h_2 is not None:
                pre_title_2.append(h_2.get_text())

        if not pre_title_1:
            raise IndexError("no section with an <h1> found on {}".format(page_url))

        title = []
        content = []

        # --- Text between the first-level heading and the first <h2> -------
        # The element id uses "-" where the module name has "."
        # (presumably how the documentation generator builds anchors - verify).
        mis_title_1 = pre_title_1[0].replace(".", "-")
        result = bs.select("#" + mis_title_1)
        if not result:
            raise IndexError("no element with id {!r} on {}".format(mis_title_1, page_url))
        body = result[0].get_text()
        title_1_location = find_en("¶", body)
        if pre_title_2:
            title_2_location = find_en(pre_title_2[0], body)
            body_1 = body[title_1_location[0] + 1: title_2_location[0]]
            # Keep the intro only if there is real text (more than 20 characters)
            # between the page heading and the first <h2>.
            if len(body_1) > 20:
                title.append(pre_title_1[0] + "§" + pre_title_1[0])
                content.append(body_1)
        else:
            # No second-level headings: the whole page body is one entry.
            body_1 = body[title_1_location[0] + 1:]
            title.append(pre_title_1[0] + "§" + pre_title_1[0])
            content.append(body_1)

        # --- One entry per second-level heading ----------------------------
        for title_2 in pre_title_2:
            heading = title_2.rstrip('¶')
            # Turn the heading text into the id of its section element.
            title_mis = heading.replace("&", "").replace(" ", "-")
            if "（" in title_mis:
                title_mis = title_mis.replace("（", "").replace("）", "")
            title_mis = title_mis.replace(".", "-").replace("_", "-").lower()

            result = bs.select("#" + title_mis)
            if not result:
                raise IndexError(
                    "no element with id {!r} for heading {!r} of {} ({})".format(
                        title_mis, heading, page_name, page_url))
            body = result[0].get_text()
            # Content is everything after the heading text (which still carries
            # its trailing "¶"); remaining "¶" markers are stripped afterwards.
            title_2_location = find_en(title_2, body)
            body_1 = body[title_2_location[0] + len(title_2):]
            title.append(pre_title_1[0] + "§" + heading)
            content.append(body_1.replace('¶', ''))

        # --- Tag every entry of this page and queue it for the CSV ---------
        # This runs once per page after all entries are collected, so pages
        # without any <h2> get their own tags as well.
        for entry_title, entry_content in zip(title, content):
            entry_text = entry_title + entry_content
            matched = []
            for pattern in tag_patterns:
                hit = pattern.search(entry_text)
                if hit is not None:
                    # Keep the text as it is spelled in the page.
                    matched.append(hit.group(0))
            Dataset.append([entry_title, page_url, entry_content, page_name, '、'.join(matched)])

    # Write everything to the CSV file.
    headers = ('title', 'link', 'content', 'process', 'tag')
    with open('html_api_data_dataset.csv', 'w', encoding='utf-8', newline='') as f:
        write = csv.writer(f)
        write.writerow(headers)
        write.writerows(Dataset)
